Load Data and EDA

df <- read.xls('../data/Data_Cortex_Nuclear.xls')
df <- df %>% 
  mutate(Genotype = factor(Genotype), Treatment = factor(Treatment), 
         Behavior = factor(Behavior), class = factor(class))

Description

  • In this dataset is 1080 mice

  • We can group mice by:

    • genotype: Control, Ts65Dn
    • treatment: Memantine, Saline
    • behavior: C/S, S/C
    • class: c-CS-m, c-CS-s, c-SC-m, c-SC-s, t-CS-m, t-CS-s, t-SC-m, t-SC-s
  • Balance of these groups:

Genotype Balance
Control 0.528
Ts65Dn 0.472
Treatment Balance
Memantine 0.528
Saline 0.472
Behavior Balance
C/S 0.486
S/C 0.514
class Balance
c-CS-m 0.139
c-CS-s 0.125
c-SC-m 0.139
c-SC-s 0.125
t-CS-m 0.125
t-CS-s 0.097
t-SC-m 0.125
t-SC-s 0.125
  • Complete observations:

552 observations is fully complete. If we want to know complete observations by each feature:

colSums(!is.na(df))
##         MouseID        DYRK1A_N         ITSN1_N          BDNF_N           NR1_N 
##            1080            1077            1077            1077            1077 
##          NR2A_N          pAKT_N         pBRAF_N       pCAMKII_N         pCREB_N 
##            1077            1077            1077            1077            1077 
##          pELK_N          pERK_N          pJNK_N          PKCA_N          pMEK_N 
##            1077            1077            1077            1077            1077 
##          pNR1_N         pNR2A_N         pNR2B_N        pPKCAB_N          pRSK_N 
##            1077            1077            1077            1077            1077 
##           AKT_N          BRAF_N        CAMKII_N          CREB_N           ELK_N 
##            1077            1077            1077            1077            1062 
##           ERK_N         GSK3B_N           JNK_N           MEK_N          TRKA_N 
##            1077            1077            1077            1073            1077 
##           RSK_N           APP_N      Bcatenin_N          SOD1_N          MTOR_N 
##            1077            1077            1062            1077            1077 
##           P38_N         pMTOR_N         DSCR1_N         AMPKA_N          NR2B_N 
##            1077            1077            1077            1077            1077 
##         pNUMB_N        RAPTOR_N         TIAM1_N        pP70S6_N          NUMB_N 
##            1077            1077            1077            1077            1080 
##         P70S6_N        pGSK3B_N         pPKCG_N          CDK5_N            S6_N 
##            1080            1080            1080            1080            1080 
##        ADARB1_N    AcetylH3K9_N          RRP1_N           BAX_N           ARC_N 
##            1080            1080            1080            1080            1080 
##         ERBB4_N          nNOS_N           Tau_N          GFAP_N         GluR3_N 
##            1080            1080            1080            1080            1080 
##         GluR4_N          IL1B_N         P3525_N        pCASP9_N         PSD95_N 
##            1080            1080            1080            1080            1080 
##          SNCA_N     Ubiquitin_N pGSK3B_Tyr216_N           SHH_N           BAD_N 
##            1080            1080            1080            1080             867 
##          BCL2_N           pS6_N         pCFOS_N           SYP_N       H3AcK18_N 
##             795            1080            1005            1080             900 
##          EGR1_N        H3MeK4_N          CaNA_N        Genotype       Treatment 
##             870             810            1080            1080            1080 
##        Behavior           class 
##            1080            1080

BDNF_N difference

data_summary <- function(data, varname, groupnames){
  summary_func <- function(x, col){
    c(mean = mean(x[[col]], na.rm=TRUE),
      sd = sd(x[[col]], na.rm=TRUE))
  }
  data_sum <- plyr::ddply(data, groupnames, .fun=summary_func,
                  varname)
  data_sum <- plyr::rename(data_sum, c("mean" = varname))
 return(data_sum)
}
dff <- data_summary(df, 'BDNF_N', 'class')
ggplot(dff, aes(class, BDNF_N)) +
  geom_line() +
  geom_pointrange(aes(ymin=BDNF_N-sd, ymax=BDNF_N+sd)) +
  ggtitle('BDNF_N expression by group')

aov_test <- summary(aov(BDNF_N ~ class, df))

BDNF_N expression significantly depends on class (F = 18.816, p value = 8.857^{-24}, df 1 = 7, df 2 = 1069).

ERBB4_N linear model

proteins <- df %>% 
              na.omit() %>% 
              select(where(is.numeric))
fit_full <- lm(ERBB4_N ~ ., proteins)
fit_null <- lm(ERBB4_N ~ 1, proteins)
scope = list(lower = fit_null, upper = fit_full)
optimal_fit <- step(fit_null, scope = scope, direction = "forward", trace = FALSE)
lm_summary <- summary(optimal_fit)
lm_summary
## 
## Call:
## lm(formula = ERBB4_N ~ ARC_N + pGSK3B_Tyr216_N + AcetylH3K9_N + 
##     SYP_N + IL1B_N + TRKA_N + BAX_N + pGSK3B_N + BAD_N + BRAF_N + 
##     DSCR1_N + P3525_N + PSD95_N + GluR4_N + ERK_N + pAKT_N + 
##     pBRAF_N + MTOR_N + NR1_N + pNR2B_N + pMTOR_N + NR2B_N + DYRK1A_N + 
##     Tau_N + pCREB_N + SHH_N + pCASP9_N + pJNK_N + pERK_N + pCAMKII_N + 
##     pPKCG_N + pPKCAB_N + ADARB1_N + BDNF_N + RRP1_N + S6_N, data = proteins)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.02238 -0.00324 -0.00021  0.00322  0.03189 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.008659   0.005377    1.61  0.10795    
## ARC_N            0.145701   0.050472    2.89  0.00406 ** 
## pGSK3B_Tyr216_N  0.026663   0.007038    3.79  0.00017 ***
## AcetylH3K9_N     0.006171   0.004449    1.39  0.16601    
## SYP_N            0.018280   0.008131    2.25  0.02499 *  
## IL1B_N           0.029177   0.008189    3.56  0.00040 ***
## TRKA_N          -0.000138   0.011409   -0.01  0.99037    
## BAX_N           -0.149413   0.026642   -5.61  3.3e-08 ***
## pGSK3B_N         0.154068   0.027653    5.57  4.1e-08 ***
## BAD_N           -0.037307   0.021855   -1.71  0.08842 .  
## BRAF_N           0.020529   0.008539    2.40  0.01656 *  
## DSCR1_N         -0.014621   0.007513   -1.95  0.05219 .  
## P3525_N          0.065503   0.018012    3.64  0.00030 ***
## PSD95_N          0.023259   0.002641    8.81  < 2e-16 ***
## GluR4_N         -0.101521   0.022379   -4.54  7.1e-06 ***
## ERK_N            0.005924   0.001636    3.62  0.00032 ***
## pAKT_N           0.075043   0.020293    3.70  0.00024 ***
## pBRAF_N         -0.048064   0.026807   -1.79  0.07356 .  
## MTOR_N           0.037294   0.011647    3.20  0.00145 ** 
## NR1_N           -0.019723   0.003809   -5.18  3.2e-07 ***
## pNR2B_N          0.017637   0.004804    3.67  0.00027 ***
## pMTOR_N         -0.007278   0.006338   -1.15  0.25133    
## NR2B_N           0.011634   0.007566    1.54  0.12471    
## DYRK1A_N        -0.006803   0.005693   -1.19  0.23267    
## Tau_N            0.066630   0.014181    4.70  3.4e-06 ***
## pCREB_N         -0.063442   0.019910   -3.19  0.00153 ** 
## SHH_N            0.041662   0.015214    2.74  0.00639 ** 
## pCASP9_N         0.008525   0.002296    3.71  0.00023 ***
## pJNK_N          -0.040549   0.016700   -2.43  0.01552 *  
## pERK_N          -0.017658   0.005448   -3.24  0.00127 ** 
## pCAMKII_N        0.001064   0.000390    2.73  0.00662 ** 
## pPKCG_N         -0.006362   0.001525   -4.17  3.5e-05 ***
## pPKCAB_N         0.005765   0.001802    3.20  0.00146 ** 
## ADARB1_N        -0.002210   0.001349   -1.64  0.10205    
## BDNF_N           0.036349   0.016161    2.25  0.02492 *  
## RRP1_N          -0.014890   0.009302   -1.60  0.11005    
## S6_N             0.006740   0.004459    1.51  0.13129    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.00578 on 515 degrees of freedom
## Multiple R-squared:  0.858,  Adjusted R-squared:  0.848 
## F-statistic: 86.5 on 36 and 515 DF,  p-value: <2e-16

Adjusted R-squared is 0.848, df is 515 with 37 predictors. So, this is a just good model, but not excellent.

Let’s compare full model and optimal model:

anova(fit_full, optimal_fit)
## Analysis of Variance Table
## 
## Model 1: ERBB4_N ~ DYRK1A_N + ITSN1_N + BDNF_N + NR1_N + NR2A_N + pAKT_N + 
##     pBRAF_N + pCAMKII_N + pCREB_N + pELK_N + pERK_N + pJNK_N + 
##     PKCA_N + pMEK_N + pNR1_N + pNR2A_N + pNR2B_N + pPKCAB_N + 
##     pRSK_N + AKT_N + BRAF_N + CAMKII_N + CREB_N + ELK_N + ERK_N + 
##     GSK3B_N + JNK_N + MEK_N + TRKA_N + RSK_N + APP_N + Bcatenin_N + 
##     SOD1_N + MTOR_N + P38_N + pMTOR_N + DSCR1_N + AMPKA_N + NR2B_N + 
##     pNUMB_N + RAPTOR_N + TIAM1_N + pP70S6_N + NUMB_N + P70S6_N + 
##     pGSK3B_N + pPKCG_N + CDK5_N + S6_N + ADARB1_N + AcetylH3K9_N + 
##     RRP1_N + BAX_N + ARC_N + nNOS_N + Tau_N + GFAP_N + GluR3_N + 
##     GluR4_N + IL1B_N + P3525_N + pCASP9_N + PSD95_N + SNCA_N + 
##     Ubiquitin_N + pGSK3B_Tyr216_N + SHH_N + BAD_N + BCL2_N + 
##     pS6_N + pCFOS_N + SYP_N + H3AcK18_N + EGR1_N + H3MeK4_N + 
##     CaNA_N
## Model 2: ERBB4_N ~ ARC_N + pGSK3B_Tyr216_N + AcetylH3K9_N + SYP_N + IL1B_N + 
##     TRKA_N + BAX_N + pGSK3B_N + BAD_N + BRAF_N + DSCR1_N + P3525_N + 
##     PSD95_N + GluR4_N + ERK_N + pAKT_N + pBRAF_N + MTOR_N + NR1_N + 
##     pNR2B_N + pMTOR_N + NR2B_N + DYRK1A_N + Tau_N + pCREB_N + 
##     SHH_N + pCASP9_N + pJNK_N + pERK_N + pCAMKII_N + pPKCG_N + 
##     pPKCAB_N + ADARB1_N + BDNF_N + RRP1_N + S6_N
##   Res.Df    RSS  Df Sum of Sq    F Pr(>F)
## 1    476 0.0165                          
## 2    515 0.0172 -39 -0.000743 0.55   0.99

p value is 0.988. This mean that optimal model and full model have not difference then optimal model is better than the full one.

PCA

proteins_pca <- rda(proteins, scale = TRUE)
  • Biplot with symmetric scaling:
biplot(proteins_pca)

  • Biplot of correlations:
biplot(proteins_pca, scaling = "species", display = "species")

  • Biplot of distances:
biplot(proteins_pca, scaling = "sites", display = "sites")

pca_summary <- summary(proteins_pca)
pca_result <- as.data.frame(pca_summary$cont)
plot_data <- as.data.frame(t(pca_result[c("Proportion Explained"),][1:24]))
plot_data$Component <- seq(1:nrow(plot_data))

ggplot(plot_data, aes(Component, `Proportion Explained`)) + 
  geom_bar(stat = "identity") + 
  scale_x_discrete(limits = seq(1:nrow(plot_data))) +
  ylab('Proportion Explained (95 %)') +
  xlab('Principal Component') +
  theme_bw() +
  ggtitle('Proportion Explained by Principal Component')

plot_data <- as.data.frame(pca_summary$sites[,1:3])
plot_ly(x = plot_data$PC1, 
        y = plot_data$PC2, 
        z = plot_data$PC3, 
        type = "scatter3d", 
        mode = "markers") %>% 
  layout(title = 'PCA 3D')

Differential expression

Form ExpressionSet:

proteins_dif <- df %>% 
  select(where(is.numeric))
proteins_dif <- normalizeQuantiles(log2(proteins_dif + 1))
row.names(proteins_dif) <- df$MouseID

factor_dif <- df['Genotype']

expr_data <- as.matrix(t(proteins_dif))

pheno_data <- df['Genotype']
row.names(pheno_data) <- df$MouseID
pheno_metadata <- data.frame(
  labelDescription = c('Genotype'), 
  row.names = c('Genotype'))
pheno_data <- new("AnnotatedDataFrame", 
                  data = pheno_data, 
                  varMetadata = pheno_metadata)

feature_data <- data.frame(Protein = rownames(expr_data))
rownames(feature_data) <- rownames(expr_data)
feature_metadata <- data.frame(
  labelDescription = c("Protein"),
  row.names = c("Protein"))
f_data <- new("AnnotatedDataFrame", 
              data = feature_data, 
              varMetadata = feature_metadata)

experiment_data <-
  new("MIAME",
      name = "Katheleen J. Gardiner",
      lab = "lab",
      contact = "katheleen.gardiner@ucdenver.edu",
      title = "Down syndrome detectable signals in the nuclear fraction of cortex",
      abstract = "Abstract",
      other = list(notes = "Dataset from Katheleen J. Gardiner et al. 2015"))

exp_set <-
  ExpressionSet(assayData = expr_data,
                phenoData = pheno_data,
                featureData = f_data,
                experimentData = experiment_data)

Linear model for limma:

X <- model.matrix(~ Genotype, pData(exp_set))
fit <- lmFit(exp_set, design = X, method = "robust", maxit = 1000)
efit <- eBayes(fit)

All differential-expressed proteins table with a = 0.05:

numGenes <- nrow(exprs(exp_set))
topTable(efit, number = numGenes) %>% 
  `rownames<-`(seq_len(numGenes)) %>% 
  filter(adj.P.Val < 0.05)
##            Protein   logFC AveExpr     t  P.Value adj.P.Val         B
## 1            APP_N  0.1054   0.634 19.62 2.55e-76  1.96e-74 162.89613
## 2            Tau_N  0.0712   0.634 12.80 1.14e-35  4.38e-34  69.72518
## 3          ITSN1_N  0.0711   0.634 12.39 1.25e-33  3.21e-32  65.05321
## 4             S6_N  0.0703   0.634 12.12 2.78e-32  5.36e-31  61.97179
## 5     AcetylH3K9_N  0.0645   0.634 11.59 8.70e-30  1.34e-28  56.27899
## 6          pPKCG_N  0.0531   0.634  8.71 7.86e-18  1.01e-16  29.01689
## 7         DYRK1A_N  0.0507   0.634  8.51 4.06e-17  4.46e-16  27.39939
## 8          GluR3_N -0.0489   0.634 -8.31 2.15e-16  2.07e-15  25.75515
## 9            SYP_N -0.0460   0.634 -7.72 2.07e-14  1.77e-13  21.25617
## 10          MTOR_N -0.0455   0.634 -7.52 9.26e-14  7.13e-13  19.78595
## 11         AMPKA_N -0.0430   0.634 -7.17 1.20e-12  8.37e-12  17.27590
## 12        pGSK3B_N  0.0391   0.634  6.64 4.38e-11  2.81e-10  13.74767
## 13         pNR2A_N -0.0412   0.634 -6.61 5.52e-11  3.27e-10  13.51638
## 14           P38_N -0.0393   0.634 -6.38 2.36e-10  1.30e-09  12.09956
## 15          NR2B_N -0.0378   0.634 -6.19 7.58e-10  3.83e-09  10.96156
## 16         pCREB_N  0.0370   0.634  6.19 7.96e-10  3.83e-09  10.91563
## 17          BRAF_N  0.0348   0.634  5.86 5.73e-09  2.53e-08   8.99247
## 18           ARC_N -0.0343   0.634 -5.84 6.25e-09  2.53e-08   8.91154
## 19           pS6_N -0.0343   0.634 -5.84 6.25e-09  2.53e-08   8.91154
## 20       H3AcK18_N  0.0376   0.634  5.72 1.33e-08  5.10e-08   8.26580
## 21          IL1B_N -0.0317   0.634 -5.31 1.25e-07  4.47e-07   6.00462
## 22          pNR1_N -0.0329   0.634 -5.31 1.28e-07  4.47e-07   5.97713
## 23         pMTOR_N -0.0316   0.634 -5.14 3.08e-07  1.03e-06   5.12923
## 24 pGSK3B_Tyr216_N  0.0303   0.634  5.02 5.68e-07  1.82e-06   4.53705
## 25        pP70S6_N  0.0306   0.634  5.00 6.57e-07  2.02e-06   4.39986
## 26          NR2A_N -0.0292   0.634 -4.71 2.69e-06  7.97e-06   3.04040
## 27          CDK5_N  0.0267   0.634  4.47 8.53e-06  2.43e-05   1.93786
## 28        RAPTOR_N -0.0276   0.634 -4.44 9.81e-06  2.70e-05   1.80157
## 29          EGR1_N -0.0296   0.634 -4.28 1.97e-05  5.22e-05   1.24297
## 30         TIAM1_N  0.0257   0.634  4.16 3.37e-05  8.66e-05   0.62739
## 31          NUMB_N  0.0248   0.634  4.07 4.93e-05  1.22e-04   0.26650
## 32        H3MeK4_N  0.0281   0.634  3.97 7.62e-05  1.83e-04  -0.00279
## 33          pRSK_N  0.0240   0.634  3.92 9.25e-05  2.16e-04  -0.32679
## 34         pNR2B_N -0.0233   0.634 -3.84 1.26e-04  2.86e-04  -0.62141
## 35          SNCA_N -0.0232   0.634 -3.82 1.40e-04  3.08e-04  -0.72000
## 36           AKT_N -0.0216   0.634 -3.49 4.94e-04  1.06e-03  -1.90119
## 37          RRP1_N  0.0210   0.634  3.45 5.69e-04  1.18e-03  -2.03298
## 38        CAMKII_N -0.0194   0.634 -3.20 1.42e-03  2.88e-03  -2.87920
## 39          TRKA_N  0.0190   0.634  3.10 1.94e-03  3.84e-03  -3.16780
## 40          pELK_N  0.0181   0.634  2.93 3.47e-03  6.68e-03  -3.69993
## 41           NR1_N -0.0179   0.634 -2.88 3.99e-03  7.40e-03  -3.82627
## 42         P3525_N  0.0179   0.634  2.88 4.03e-03  7.40e-03  -3.83845
## 43        ADARB1_N -0.0175   0.634 -2.79 5.35e-03  9.57e-03  -4.09472
## 44         DSCR1_N  0.0157   0.634  2.56 1.06e-02  1.86e-02  -4.70565
## 45         GSK3B_N  0.0151   0.634  2.44 1.47e-02  2.51e-02  -4.99283
## 46         pCFOS_N -0.0139   0.634 -2.19 2.89e-02  4.83e-02  -5.54608

Let’s draw a heatmap:

my_list <- topTable(efit, coef = 2, n = 20)
dif_exp_set <- exp_set[fData(exp_set)$Protein %in% my_list$Protein, ]

dat <- as.matrix(exprs(dif_exp_set))
rownames(dat) <- fData(dif_exp_set)$Protein
colnames(dat) <- NULL

pal_green <- colorpanel(75, low = "black", mid = "darkgreen", high = "yellow")
heatmap.2(dat, col = pal_green, scale = "none", key=TRUE, symkey = FALSE, density.info = "none", trace = "none", cexRow = 0.9, cexCol = 1, margins = c(1, 6), keysize = 1, key.par = list(mar = c(2, 1, 2, 1)), key.xlab = '')
First 20 differential-expressed proteins in 1080 samples

First 20 differential-expressed proteins in 1080 samples